In [1]:
pip install plotly
Collecting plotly
  Downloading plotly-4.9.0-py2.py3-none-any.whl (12.9 MB)
     |████████████████████████████████| 12.9 MB 645 kB/s eta 0:00:01
Requirement already satisfied: six in /Applications/anaconda3/lib/python3.8/site-packages (from plotly) (1.15.0)
Collecting retrying>=1.3.3
  Downloading retrying-1.3.3.tar.gz (10 kB)
Building wheels for collected packages: retrying
  Building wheel for retrying (setup.py) ... done
  Created wheel for retrying: filename=retrying-1.3.3-py3-none-any.whl size=11430 sha256=e760cebf2b5e4a5fe3794e27c3e3dd103cce85d2a58e54d2dabf117d9e49e2ca
  Stored in directory: /Users/wangjiaxuan/Library/Caches/pip/wheels/c4/a7/48/0a434133f6d56e878ca511c0e6c38326907c0792f67b476e56
Successfully built retrying
Installing collected packages: retrying, plotly
Successfully installed plotly-4.9.0 retrying-1.3.3
Note: you may need to restart the kernel to use updated packages.
In [18]:
import numpy as np
import pandas as pd 
import json
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import cm
import plotly.graph_objects as go
import datetime as dt
import plotly.express as px
import math
from PIL import Image
import glob
import seaborn as sns
import re
import os
import seaborn as sns
%matplotlib inline
In [19]:
# Load the trending-video dataset for each country
# (Canada, United States, France, Great Britain, India).
# The CSV files are expected in the current working directory.
CA =pd.read_csv('CAvideos.csv')
US =pd.read_csv('USvideos.csv')
FR =pd.read_csv('FRvideos.csv')
GB =pd.read_csv('GBvideos.csv')
IN =pd.read_csv('INvideos.csv')
In [20]:
# Tag each country's frame with its code, then stack all five frames into one
# combined dataframe and persist it to disk.
for code, frame in (('CA', CA), ('US', US), ('FR', FR), ('GB', GB), ('IN', IN)):
    frame['country'] = code
df = pd.concat([CA, US, FR, GB, IN])
df.to_csv('ALL.csv')
In [21]:
df.head()
Out[21]:
video_id trending_date title channel_title category_id publish_time tags views likes dislikes comment_count thumbnail_link comments_disabled ratings_disabled video_error_or_removed description country
0 n1WpP7iowLc 17.14.11 Eminem - Walk On Water (Audio) ft. Beyoncé EminemVEVO 10 2017-11-10T17:00:03.000Z Eminem|"Walk"|"On"|"Water"|"Aftermath/Shady/In... 17158579 787425 43420 125882 https://i.ytimg.com/vi/n1WpP7iowLc/default.jpg False False False Eminem's new track Walk on Water ft. Beyoncé i... CA
1 0dBIkQ4Mz1M 17.14.11 PLUSH - Bad Unboxing Fan Mail iDubbbzTV 23 2017-11-13T17:00:00.000Z plush|"bad unboxing"|"unboxing"|"fan mail"|"id... 1014651 127794 1688 13030 https://i.ytimg.com/vi/0dBIkQ4Mz1M/default.jpg False False False STill got a lot of packages. Probably will las... CA
2 5qpjK5DgCt4 17.14.11 Racist Superman | Rudy Mancuso, King Bach & Le... Rudy Mancuso 23 2017-11-12T19:05:24.000Z racist superman|"rudy"|"mancuso"|"king"|"bach"... 3191434 146035 5339 8181 https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg False False False WATCH MY PREVIOUS VIDEO â–¶ \n\nSUBSCRIBE â–º http... CA
3 d380meD0W0M 17.14.11 I Dare You: GOING BALD!? nigahiga 24 2017-11-12T18:01:41.000Z ryan|"higa"|"higatv"|"nigahiga"|"i dare you"|"... 2095828 132239 1989 17518 https://i.ytimg.com/vi/d380meD0W0M/default.jpg False False False I know it's been a while since we did this sho... CA
4 2Vv-BfVoq4g 17.14.11 Ed Sheeran - Perfect (Official Music Video) Ed Sheeran 10 2017-11-09T11:04:14.000Z edsheeran|"ed sheeran"|"acoustic"|"live"|"cove... 33523622 1634130 21082 85067 https://i.ytimg.com/vi/2Vv-BfVoq4g/default.jpg False False False 🎧: https://ad.gt/yt-perfect\n💰: https://atlant... CA

1. Data Cleaning

  1. Extract the category information from the JSON File
In [22]:
# Use string ids so they line up with the JSON file's string keys.
df['category_id'] = df['category_id'].astype(str)


# Build an id -> category-name lookup from the YouTube category metadata file.
with open('US_category_id.json', 'r') as f:
    items = json.load(f)['items']
id_to_name = {item['id']: item['snippet']['title'] for item in items}

# Insert the readable category name next to category_id.
df.insert(4, 'category', df['category_id'].map(id_to_name))
category_list = df['category'].unique()
category_list
Out[22]:
array(['Music', 'Comedy', 'Entertainment', 'News & Politics',
       'People & Blogs', 'Howto & Style', 'Film & Animation',
       'Science & Technology', 'Gaming', 'Sports',
       'Nonprofits & Activism', 'Pets & Animals', 'Travel & Events',
       'Autos & Vehicles', 'Education', 'Shows', 'Movies', 'Trailers'],
      dtype=object)
  1. Prepare data type columns
In [23]:
# trending_date is stored as 'yy.dd.mm' (e.g. '17.14.11' = 2017-11-14),
# hence the unusual '%y.%d.%m' format string.
df['trending_date'] = pd.to_datetime(df['trending_date'], format='%y.%d.%m')
# publish_time is ISO-8601 (e.g. '2017-11-10T17:00:03.000Z'); let pandas infer it.
df['publish_time'] = pd.to_datetime(df['publish_time'], infer_datetime_format=True)
  1. Add column for publish time
In [24]:
# Derive date, weekday (Monday=0) and hour-of-day columns from publish_time.
df['publish_date'] = df['publish_time'].dt.date
df['publish_wd'] = df['publish_time'].dt.weekday
df['publish_hr'] = df['publish_time'].dt.hour
# NOTE(review): this overwrites the datetime column with time-of-day only,
# so the full timestamp is no longer available downstream of this cell.
df['publish_time'] = df['publish_time'].dt.time

df.head()
Out[24]:
video_id trending_date title channel_title category category_id publish_time tags views likes ... comment_count thumbnail_link comments_disabled ratings_disabled video_error_or_removed description country publish_date publish_wd publish_hr
0 n1WpP7iowLc 2017-11-14 Eminem - Walk On Water (Audio) ft. Beyoncé EminemVEVO Music 10 17:00:03 Eminem|"Walk"|"On"|"Water"|"Aftermath/Shady/In... 17158579 787425 ... 125882 https://i.ytimg.com/vi/n1WpP7iowLc/default.jpg False False False Eminem's new track Walk on Water ft. Beyoncé i... CA 2017-11-10 4 17
1 0dBIkQ4Mz1M 2017-11-14 PLUSH - Bad Unboxing Fan Mail iDubbbzTV Comedy 23 17:00:00 plush|"bad unboxing"|"unboxing"|"fan mail"|"id... 1014651 127794 ... 13030 https://i.ytimg.com/vi/0dBIkQ4Mz1M/default.jpg False False False STill got a lot of packages. Probably will las... CA 2017-11-13 0 17
2 5qpjK5DgCt4 2017-11-14 Racist Superman | Rudy Mancuso, King Bach & Le... Rudy Mancuso Comedy 23 19:05:24 racist superman|"rudy"|"mancuso"|"king"|"bach"... 3191434 146035 ... 8181 https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg False False False WATCH MY PREVIOUS VIDEO â–¶ \n\nSUBSCRIBE â–º http... CA 2017-11-12 6 19
3 d380meD0W0M 2017-11-14 I Dare You: GOING BALD!? nigahiga Entertainment 24 18:01:41 ryan|"higa"|"higatv"|"nigahiga"|"i dare you"|"... 2095828 132239 ... 17518 https://i.ytimg.com/vi/d380meD0W0M/default.jpg False False False I know it's been a while since we did this sho... CA 2017-11-12 6 18
4 2Vv-BfVoq4g 2017-11-14 Ed Sheeran - Perfect (Official Music Video) Ed Sheeran Music 10 11:04:14 edsheeran|"ed sheeran"|"acoustic"|"live"|"cove... 33523622 1634130 ... 85067 https://i.ytimg.com/vi/2Vv-BfVoq4g/default.jpg False False False 🎧: https://ad.gt/yt-perfect\n💰: https://atlant... CA 2017-11-09 3 11

5 rows × 21 columns

  1. Dropping some columns and removing duplicates (only include the first trending date of each video)
In [8]:
# Drop free-text / rarely-used columns not needed for the analysis.
df = df.drop(['tags', 'video_error_or_removed', 'description'],axis = 1)
In [9]:
# Keep only the first occurrence of each fully duplicated row
# (a video can otherwise appear once per trending day).
df = df.drop_duplicates(keep = 'first')
In [10]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 194340 entries, 0 to 37330
Data columns (total 18 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   video_id           194340 non-null  object        
 1   trending_date      194340 non-null  datetime64[ns]
 2   title              194340 non-null  object        
 3   channel_title      194340 non-null  object        
 4   category           194340 non-null  object        
 5   category_id        194340 non-null  object        
 6   publish_time       194340 non-null  object        
 7   views              194340 non-null  int64         
 8   likes              194340 non-null  int64         
 9   dislikes           194340 non-null  int64         
 10  comment_count      194340 non-null  int64         
 11  thumbnail_link     194340 non-null  object        
 12  comments_disabled  194340 non-null  bool          
 13  ratings_disabled   194340 non-null  bool          
 14  country            194340 non-null  object        
 15  publish_date       194340 non-null  object        
 16  publish_wd         194340 non-null  int64         
 17  publish_hr         194340 non-null  int64         
dtypes: bool(2), datetime64[ns](1), int64(6), object(9)
memory usage: 25.6+ MB
  1. Set the display format to 2 decimal places for easier reading
In [11]:
# Display floats with 2 decimal places in dataframe output.
pd.options.display.float_format = '{:.2f}'.format 
In [12]:
df.describe()
Out[12]:
views likes dislikes comment_count publish_wd publish_hr
count 194340.00 194340.00 194340.00 194340.00 194340.00 194340.00
mean 2174271.77 58766.68 3159.36 6260.59 2.85 13.28
std 9604205.57 210076.27 29235.15 31640.70 1.91 6.17
min 223.00 0.00 0.00 0.00 0.00 0.00
25% 104246.50 1556.00 87.00 207.00 1.00 9.00
50% 356178.00 7882.00 336.00 929.00 3.00 15.00
75% 1199360.75 34221.25 1255.00 3540.00 4.00 18.00
max 424538912.00 5613827.00 1944971.00 1626501.00 6.00 23.00

2. Data Exploration In US

In [23]:
#create a category_name column to match the category_id
# Map category_id -> human-readable name for the US frame.
# The original cell set these one .loc at a time and assigned id 25 twice
# ('News and Politics', then 'News & Politics'); the second assignment won,
# so the mapping below keeps 'News & Politics'.
CATEGORY_NAMES = {
    1: 'Film and Animation',
    2: 'Cars and Vehicles',
    10: 'Music',
    15: 'Pets and Animals',
    17: 'Sport',
    19: 'Travel and Events',
    20: 'Gaming',
    22: 'People and Blogs',
    23: 'Comedy',
    24: 'Entertainment',
    25: 'News & Politics',
    26: 'How to and Style',
    27: 'Education',
    28: 'Science and Technology',
    29: 'Non Profits and Activism',
}
# Ids without a mapping are left as NaN, matching the original behaviour of
# initialising the column with np.nan and only setting matched ids.
US['category_name'] = US['category_id'].map(CATEGORY_NAMES)
In [14]:
# Log-transform the heavily right-skewed engagement metrics (+1 keeps zeros finite).
US['likes_log'] = np.log(US['likes'] + 1)
US['views_log'] = np.log(US['views'] + 1)
US['dislikes_log'] = np.log(US['dislikes'] + 1)
US['comment_log'] = np.log(US['comment_count'] + 1)

plt.figure(figsize = (24,12))

# The original repeated the same four-line subplot block once per metric;
# draw them in a loop instead. Tuples are (subplot position, column, colour, title),
# preserving the original panel placement (221, 224, 223, 222).
panels = [
    (221, 'views_log', None, 'Views log distribution'),
    (224, 'likes_log', 'green', 'Likes log distribution'),
    (223, 'dislikes_log', 'red', 'Dislikes log distribution'),
    (222, 'comment_log', 'orange', 'Comments log distribution'),
]
for pos, col, color, title in panels:
    plt.subplot(pos)
    ax = sns.distplot(US[col], color=color)
    ax.set_title(title, fontsize=16)
    sns.set(font_scale=1)

plt.subplots_adjust(wspace = 0.2, hspace = 0.4,top = 0.9)

plt.show()
In [24]:
#make a dataframe for category_name and number of the videos
#make a dataframe for category_name and number of the videos
# value_counts -> tidy two-column frame: category_name, counts (descending).
US_count = US.category_name.value_counts().rename_axis('category_name').reset_index(name='counts')
US_count
Out[24]:
category_name counts
0 Entertainment 9964
1 Music 6472
2 How to and Style 4146
3 Comedy 3457
4 People and Blogs 3210
5 News & Politics 2487
6 Science and Technology 2401
7 Film and Animation 2345
8 Sport 2174
9 Education 1656
10 Pets and Animals 920
11 Gaming 817
12 Travel and Events 402
13 Cars and Vehicles 384
14 Non Profits and Activism 57
In [28]:
#bar chart for the category_name and counts
#bar chart for the category_name and counts
plt.subplots(figsize=(40,20))
sns.barplot(x='counts', y='category_name', data = US_count).set(title='US trending video number by category')
# NOTE(review): sns.set(font_scale=5) changes the global seaborn theme and
# affects every later plot until it is reset (later cells reset it to 1).
sns.set(font_scale=5)
In [30]:
#display views distribution by category names
#display views distribution by category names
plt.figure(figsize = (20,6))

g = sns.boxplot(x='category_name', y='views_log', data=US)
# Rotate the category labels so they do not overlap.
g.set_xticklabels(g.get_xticklabels(),rotation=45, size=15)
sns.set(font_scale=1)
g.set_title('Views Distribution by Category Names', fontsize=15)
g.set_xlabel('',fontsize=15)
g.set_ylabel('Views(log)',fontsize=15)
plt.show()
In [32]:
#calculate the ratio
# Express each engagement metric as a percentage of views per video.
US['like_rate'] = US['likes'].div(US['views']).mul(100)
US['dislike_rate'] = US['dislikes'].div(US['views']).mul(100)
US['comment_rate'] = US['comment_count'].div(US['views']).mul(100)
In [34]:
#display like rate distribution
#display like rate distribution
plt.figure(figsize = (20,6))
g = sns.boxplot(x='category_name', y='like_rate', data=US)
sns.set(font_scale=1)
# Rotate category labels to avoid overlap.
g.set_xticklabels(g.get_xticklabels(),rotation=45, size=15)
g.set_title('Like Rate Distribution', fontsize=15)
g.set_xlabel("", fontsize=12)
g.set_ylabel('Like rate', fontsize=12)
plt.show()
In [35]:
#display dislike rate distribution
#display dislike rate distribution
plt.figure(figsize = (20,6))

g = sns.boxplot(x='category_name', y='dislike_rate', data=US)
sns.set(font_scale=1)
# Rotate category labels to avoid overlap.
g.set_xticklabels(g.get_xticklabels(),rotation=45)
g.set_title('Dislike Rate Distribution', fontsize=15)
g.set_xlabel('', fontsize=12)
g.set_ylabel('Dislike rate', fontsize=12)
plt.show()
In [37]:
#display comment rate distribution
#display comment rate distribution
plt.figure(figsize = (20,6))

g = sns.boxplot(x='category_name', y='comment_rate', data=US)
# Rotate category labels to avoid overlap.
g.set_xticklabels(g.get_xticklabels(),rotation=45)
sns.set(font_scale=1)
g.set_title('Comment Rate Distribution', fontsize=15)
g.set_xlabel('', fontsize=12)
g.set_ylabel('Comment rate', fontsize=12)
plt.show()
In [39]:
#check the publish year in datasets
#check the publish year in datasets
# US['publish_time'] is still the raw ISO string here (only df was converted
# to datetime earlier), so the first four characters are the year.
US['publish_year']=US['publish_time'].astype(str)
US['publish_year']=US['publish_year'].str[:4]
US.publish_year.value_counts()
Out[39]:
2018    30279
2017    10428
2013       44
2016       35
2015       35
2014       32
2011       27
2012       24
2010       19
2009       14
2008       11
2006        1
Name: publish_year, dtype: int64
In [41]:
US.head(2)
Out[41]:
video_id trending_date title channel_title category_id publish_time tags views likes dislikes ... country likes_log views_log dislikes_log comment_log category_name like_rate dislike_rate comment_rate publish_year
0 2kyS6SvSYSE 17.14.11 WE WANT TO TALK ABOUT OUR MARRIAGE CaseyNeistat 22 2017-11-13T17:13:01.000Z SHANtell martin 748374 57527 2966 ... US 10.96 13.53 8.00 9.68 People and Blogs 7.69 0.40 2.13 2017
1 1ZAPwfrtAFY 17.14.11 The Trump Presidency: Last Week Tonight with J... LastWeekTonight 24 2017-11-13T07:30:00.000Z last week tonight trump presidency|"last week ... 2418783 97185 6146 ... US 11.48 14.70 8.72 9.45 Entertainment 4.02 0.25 0.53 2017

2 rows × 26 columns

In [42]:
#since only 2017 and 2018 have more data, we are going to do more research for 2017 and 2018
# Per-category counts of US trending videos published in 2017.
counts_2017 = US.loc[US['publish_year'] == '2017', 'category_name'].value_counts()
US_2017 = counts_2017.rename_axis('category_name').reset_index(name='counts')
US_2017['year'] = '2017'
US_2017.head()
Out[42]:
category_name counts year
0 Entertainment 2507 2017
1 Music 1669 2017
2 Comedy 992 2017
3 How to and Style 979 2017
4 People and Blogs 862 2017
In [43]:
# Per-category counts of US trending videos published in 2018.
counts_2018 = US.loc[US['publish_year'] == '2018', 'category_name'].value_counts()
US_2018 = counts_2018.rename_axis('category_name').reset_index(name='counts')
US_2018['year'] = '2018'
US_2018.head()
Out[43]:
category_name counts year
0 Entertainment 7407 2018
1 Music 4785 2018
2 How to and Style 3163 2018
3 Comedy 2457 2018
4 People and Blogs 2328 2018
In [44]:
# Stack the 2017 and 2018 per-category counts for plotting side by side.
US_year = pd.concat([US_2017,US_2018])
In [45]:
# Grouped bar chart: number of videos per category, 2017 vs 2018.
plt.subplots(figsize=(40,20))
sns.barplot(x='counts', y='category_name', hue='year', data=US_year,palette="Blues").set(title='number of videos by category and years')
sns.set(font_scale=1)
In [46]:
# Year-over-year growth per category: (2018 count / 2017 count) - 1,
# aligned on category_name, sorted descending.
per_cat_2018 = US_2018.groupby('category_name')['counts'].sum()
per_cat_2017 = US_2017.groupby('category_name')['counts'].sum()
number_growth = (per_cat_2018 / per_cat_2017 - 1).sort_values(ascending=False).reset_index()
number_growth.columns = ['category_name', 'growth_rate']
number_growth
Out[46]:
category_name growth_rate
0 Gaming 7.77
1 Education 2.34
2 Sport 2.24
3 How to and Style 2.23
4 Non Profits and Activism 2.07
5 Film and Animation 2.04
6 Science and Technology 2.04
7 Entertainment 1.95
8 Pets and Animals 1.91
9 Music 1.87
10 People and Blogs 1.70
11 Comedy 1.48
12 Travel and Events 1.19
13 News & Politics 1.14
14 Cars and Vehicles 1.09
In [47]:
# Bar chart of the per-category growth rates computed above.
plt.figure(figsize=(20,6))
g = sns.barplot(x='category_name',y='growth_rate',data=number_growth)
g.set_xticklabels(g.get_xticklabels(),rotation=45)
g.set_title("Growth rate by video category ", fontsize=15)
g.set_xlabel('category_name',fontsize=15)
g.set_ylabel('growth_rate',fontsize=12)
plt.show()

3. Data Exploration Worldwide

In [49]:
#A.Ratio of trending videos in five countries
#A.Ratio of trending videos in five countries
labels = df.groupby(['country']).count().index
sizes = df.groupby(['country']).count()['title']
# Pull the 4th slice out of the pie for emphasis (index 3 = 'IN' in the
# alphabetical country order CA, FR, GB, IN, US). The original "'Hogs'"
# comment was left over from the matplotlib gallery example.
explode = (0, 0, 0, 0.1, 0)
cmap = plt.get_cmap('Spectral')
# Sample 8 evenly spaced colours from the colormap (only 5 slices are drawn).
colors = [cmap(i) for i in np.linspace(0, 1, 8)]


plt.subplots(figsize=(10,10))
plt.pie(sizes, labels=labels, autopct='%1.1f%%',
        shadow=True, explode=explode, startangle=100,colors=colors)
plt.show()
In [50]:
#B. Correlation of trending videos between countries
#B. Correlation of trending videos between countries
# Count how many trending days each (video_id, country) pair has.
fre_df = pd.DataFrame(df.groupby(['video_id','country']).count()['title'].sort_values(ascending=False)).reset_index()
# NOTE(review): the top video_id '#NAME?' looks like a spreadsheet-corruption
# artifact in the IN dataset (ids starting with '=' mangled on export) — verify.
fre_df.head(), fre_df.tail()
Out[50]:
(      video_id country  title
 0       #NAME?      IN    442
 1  2z3EUY1aXdY      GB     38
 2  NooW_RbfdWI      GB     38
 3  BhIEIO0vaBE      GB     38
 4  YBpdL9hSac4      GB     37,
           video_id country  title
 80933  ZxZUi_awl4A      FR      1
 80934  ZxVFfRLQ_zA      FR      1
 80935  ZxOmJfCEgoc      CA      1
 80936  ZwvrFY23Atc      CA      1
 80937  V-AFAo13-04      FR      1)
In [51]:
# For each country, build the distribution of "how many days did a video
# trend" (value_counts over the per-video trending-day counts) and record
# the maximum number of appearances.
video_list, max_list = list(), list()
country_list = df.groupby(['country']).count().index

for c in country_list:
    # Compute the distribution once per country; the original cell computed
    # the same value_counts().sort_index() twice per iteration.
    appearances = fre_df[fre_df['country'] == c]['title'].value_counts().sort_index()
    video_list.append(appearances)
    max_list.append(max(appearances.index))
In [52]:
# One row per country: distribution of trending duration, x capped at 30 days.
fig, [ax0, ax1, ax2, ax3, ax4] = plt.subplots(nrows=5,figsize=(15, 20),)
st = fig.suptitle("How long a video trend in different countries?", fontsize=20)
st.set_y(0.9)
for i, pt in enumerate([ax0, ax1, ax2, ax3, ax4]):
    pt.plot(video_list[i].index, video_list[i])
    # Hide top/right spines for a cleaner look.
    pt.spines['right'].set_visible(False)
    pt.spines['top'].set_visible(False)
    pt.set_xlabel("appearances",fontsize=14)
    pt.set_ylabel(country_list[i],fontsize=24)
    pt.axes.set_xlim(1, 30)
# Tweak spacing between subplots to prevent labels from overlapping
plt.subplots_adjust(hspace=0.2)
plt.subplots_adjust(wspace=0)
In [53]:
#C. What is the overall ratio of Likes-Dislikes in different categories?
#C. What is the overall ratio of Likes-Dislikes in different categories?
# Total likes divided by total dislikes per category, sorted descending.
total_likes = df.groupby('category')['likes'].sum()
total_dislikes = df.groupby('category')['dislikes'].sum()
like_dislike_ratio = (total_likes / total_dislikes).sort_values(ascending=False).reset_index()
like_dislike_ratio.columns = ['category','ratio']
plt.subplots(figsize=(10, 15))
sns.barplot(x="ratio", y="category", data=like_dislike_ratio,
            label="Likes-Dislikes Ratio")
Out[53]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fd095f79fd0>
In [54]:
#What is the overall ratio of Views-Comments in different categories?
#What is the overall ratio of Views-Comments in different categories?
# Higher ratio = fewer comments per view for that category.
views_comment_ratio = df.groupby('category')['views'].agg('sum') / df.groupby('category')['comment_count'].agg('sum')
views_comment_ratio = views_comment_ratio.sort_values(ascending=False).reset_index()
views_comment_ratio.columns = ['category','ratio']
plt.subplots(figsize=(10, 15))
sns.barplot(x="ratio", y="category", data=views_comment_ratio,
            label="Views-Comments Ratio")
Out[54]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fd094d45850>
In [55]:
#What is the overall ratio of Likes-Views Ratio in different categories?
#What is the overall ratio of Likes-Views Ratio in different categories?
# NOTE(review): despite the variable name and bar label, this computes
# likes / views (likes per view), not views per like.
view_like_ratio = df.groupby('category')['likes'].agg('sum') / df.groupby('category')['views'].agg('sum')
view_like_ratio = view_like_ratio.sort_values(ascending=False).reset_index()
view_like_ratio.columns = ['category','ratio']
plt.subplots(figsize=(10, 15))
sns.barplot(x="ratio", y="category", data=view_like_ratio,
            label="Views-Likes Ratio")
Out[55]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fd0af1c0310>
In [25]:
#D. what is the categories of trending videos in US/CA/GB/FR/IN
#D. what is the categories of trending videos in US/CA/GB/FR/IN
# The original cell repeated the same three lines once per country; build the
# per-country category counts in a loop instead. (The individual cat_df_us /
# cat_df_ca / ... intermediates are not referenced by any later cell.)
country_frames = []
for code in ['US', 'CA', 'GB', 'FR', 'IN']:
    cat_counts = df[df['country'] == code]['category'].value_counts().reset_index()
    cat_counts['country'] = code
    country_frames.append(cat_counts)
cat_df_all = pd.concat(country_frames)
cat_df_all.head()
Out[25]:
index category country
0 Entertainment 9964 US
1 Music 6472 US
2 Howto & Style 4146 US
3 Comedy 3457 US
4 People & Blogs 3210 US
In [26]:
# With this pandas version, value_counts().reset_index() yields columns
# 'index' (category name) and 'category' (count), so x is the count and y the
# name. NOTE(review): pandas >= 2.0 names these columns differently
# ('category'/'count'), which would break this call — confirm version pinning.
sns.barplot(x='category', y='index', hue='country', data=cat_df_all,palette="Blues",linewidth = 20)
Out[26]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fc7d57fed30>
In [60]:
# factorplot was renamed to catplot (the UserWarning in the original output
# says exactly this); kind='bar' is passed explicitly so behaviour is unchanged.
sns.catplot(x='category',y='country',col='index',col_wrap=4, kind='bar', data=cat_df_all, palette="RdPu")
/opt/anaconda3/lib/python3.7/site-packages/seaborn/categorical.py:3669: UserWarning: The `factorplot` function has been renamed to `catplot`. The original name will be removed in a future release. Please update your code. Note that the default `kind` in `factorplot` (`'point'`) has changed `'strip'` in `catplot`.
  warnings.warn(msg)
Out[60]:
<seaborn.axisgrid.FacetGrid at 0x7fd0935d1c90>
In [61]:
# E overall correlation of numeric variables 
# E overall correlation of numeric variables 
corr_list = df[['views','likes','dislikes','comment_count']]
plt.figure(figsize=(15,10))
ax = sns.heatmap(data=corr_list.corr(),annot=True)
# Workaround for the matplotlib 3.1.x issue where the first and last heatmap
# rows are cropped: expand the y-limits by half a cell on each side.
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)

4. Numbers over time Worldwide

In [27]:
# The like to dislike ratio to measure of the viewers' approval of the video
# Like-to-dislike ratio as a measure of viewers' approval of a video.
# NOTE: rows with 0 dislikes produce inf here; the over_time helper below
# filters out dislikes == 0 before averaging.
df["ldratio"] = df["likes"]/df["dislikes"]
In [28]:
# These variables record the extent to which people react to the video.
# These variables record the extent to which people react to the video:
# comments per view, and (likes + dislikes) per view.
df["perc_comment"] = df["comment_count"] / df["views"]
df["perc_reaction"] = (df["likes"] + df["dislikes"]) / df["views"]
In [29]:
# The Numbers Over Time
# Let's visualize views, like to dislike ratio, and more over time using the trending date.
def over_time(df, var):
    """Plot the daily average of `var` across trending dates.

    Rows with zero dislikes are excluded so that ratio columns such as
    'ldratio' (likes/dislikes) do not contribute inf values to the mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'trending_date', 'dislikes' and `var` columns.
    var : str
        Name of the numeric column to average and plot.
    """
    # Average only the requested column — the original averaged every column,
    # which wastes work and relies on non-numeric columns being silently dropped.
    averages = df[df["dislikes"] != 0].groupby("trending_date")[var].mean()
    plt.plot(averages.index.values, averages.values)
    plt.xticks(rotation = 90)
    plt.xlabel("Date")
    plt.ylabel(f"Average {var}")
    plt.title(f"Average {var} Over Time (11/14/17 - 6/14/18)")
    plt.show()
In [30]:
over_time(df, "views")
In [31]:
# Views per trending video appeared to skyrocket beginning around February of 2018.
over_time(df, "ldratio")
In [74]:
# Some event caused the average like to dislike ratio on trending videos to decrease dramatically around January of 2018.
over_time(df, "perc_reaction") #Recall perc_reaction is (likes + dislikes) / views
In [75]:
#There was a large increase in people who liked and disliked trending videos in December of 2017, and a large decrease in May of 2018.
over_time(df, "perc_comment") #Recall perc_comment is comments / views
In [78]:
#The percent of people who comment on trending videos has been quite volatile, though exhibits similar patterns as our "perc_reation" chart (May, 2018 for example).
# What publishing time receives the most views?
# NOTE(review): this averages every column; newer pandas versions raise on
# non-numeric columns without numeric_only=True — confirm the pandas version.
by_hour = df.groupby("publish_hr").mean()
In [79]:
# Average views by publish hour: line plus point markers.
plt.plot(by_hour.index.values, by_hour["views"])
plt.scatter(by_hour.index.values, by_hour["views"])
plt.xlabel("Publish Hour of the Day")
plt.ylabel("Average Number of Views")
plt.title("Average Amount of Views on Trending Videos by the Hour")
plt.show()

Videos published at 4 AM received the most views on average. This may be because people are first waking up after this time and have all day to make the video popular. 9 AM is also a good time to publish a video when hoping for many views on the trending list. Trending videos published later in the evening usually aren't viewed as much.

In [ ]:
 
In [ ]:
 
In [ ]:
 

5. Publish Date Worldwide

  1. What day of the week to publish video
In [32]:
# Map weekday number (pandas convention: Monday=0 ... Sunday=6) to its name,
# then count how many videos were published on each weekday.
data = df['publish_wd'].map(dict(zip(range(7),
    ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']))).value_counts()

fig1 = go.Figure(data=[go.Bar(x=data.index.values, y=data, textposition='auto')])
fig1.update_layout(title="Number of Videos Published in Weekday", yaxis=dict(title="Videos"))
fig1.show()

Most of the videos are published on weekdays rather than on Saturday and Sunday, which can increase the chances of getting more views.

In [33]:
# Videos with more than 20M views; weekday numbers mapped to names.
data = df[['title', 'channel_title', 'category_id', 'views', 'publish_wd',
          'publish_hr', 'likes', 'dislikes','country']].loc[df.views > 20000000].reset_index()
data.publish_wd = data.publish_wd.map(dict(zip(range(7),
            ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'])))

def bubble_plt(target, plot_title, target_title, data):
    """Bubble chart of `target`/views ratio vs. views, one trace per weekday.

    Parameters
    ----------
    target : str
        Column plotted as a ratio to views (e.g. 'likes' or 'dislikes').
    plot_title : str
        Figure title.
    target_title : str
        Y-axis title.
    data : pandas.DataFrame
        The filtered frame built above; mutated in place ('text' and 'size'
        columns are added).
    """
    # Build per-row hover text and bubble sizes (target-per-view ratio).
    hover_text = []
    bubble_size = []
    for index, row in data.iterrows():
        hover_text.append(('Title: {title}<br>'+
                          'Category: {category_id}<br>'+
                          'Channel: {channel_title}<br>'+
                          'Views: {views}<br>'+
                          'Likes: {likes}<br>'+
                          'Dislikes: {dislikes}<br>'+
                          'country: {country}<br>'
                          ).format(title=row['title'],
                                  channel_title=row['channel_title'],
                                  category_id=row['category_id'],
                                  views = row['views'],
                                  likes = row['likes'],
                                  dislikes = row['dislikes'],
                                  country = row['country']))
        bubble_size.append(row[target]/row['views'])
    data['text'] = hover_text
    data['size'] = bubble_size
    fig2 = go.Figure()

    # BUG FIX: the original list contained the misspelling 'Thurday', whose
    # query matched no rows, so Thursday's videos were silently missing
    # from the chart.
    weekday = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    wd_data = {wd:data.query("publish_wd == '%s'"%wd)
              for wd in weekday}

    # Create the figure: one scatter trace per weekday.
    for key, values in wd_data.items():
        fig2.add_trace(go.Scatter(
            x=values['views'], y=values[target]/values['views'],
            name=key, text=values['text'],
            marker_size=values['size'],
            ))
    # The following sizeref formula is recommended by
    # https://plotly.com/python/bubble-charts/
    sizeref = 2.*max(data['size'])/(1000)

    fig2.update_traces(mode='markers', marker=dict(sizemode='area',sizeref=sizeref, line_width=2))

    fig2.update_layout(
        title=plot_title,
        xaxis=dict(
            title='Number of views in millions',
            gridcolor='white',
            type='log',
            gridwidth=2,
        ),
        yaxis=dict(
            title=target_title,
            gridcolor='white',
            gridwidth=2,
        ),
        paper_bgcolor='rgb(243, 243, 243)',
        plot_bgcolor='rgb(243, 243, 243)',
        legend = {'itemsizing': 'constant'}
    )

    fig2.show()
bubble_plt('likes',"like/view Ratio vs. Number of views", "Like/view Ratio", data)
In [34]:
bubble_plt('dislikes', "Dislikes/view ratio vs. Number of views", "Dislikes/view Ratio",data)
  1. Prediction--when to upload a video

Create a dataframe for modeling and a column 'day_to_trend' for number of days a video takes to get on the trending list

In [35]:
# Model only videos with comments disabled but ratings enabled.
# NOTE(review): this keeps a small, unusual subset (comment_count is always 0
# in the preview below); confirm the filter wasn't meant to be
# ~df.comments_disabled instead.
new_data = df.loc[(df.comments_disabled) &
                 (~df.ratings_disabled)].copy()

# Days between publishing and each trending appearance.
new_data['day_to_trend'] = abs(np.subtract(new_data.trending_date.dt.date,new_data.publish_date,dtype=np.float32)
                               .apply(lambda x: x.days))
left_vars = ['views','likes','dislikes','comment_count','publish_wd','publish_hr','day_to_trend','title']

new_data = new_data[left_vars]
new_data.reset_index(inplace=True)
new_data.head()
Out[35]:
index views likes dislikes comment_count publish_wd publish_hr day_to_trend title
0 70 13433 74 57 0 0 3 1 The National for Sunday, November 12, 2017
1 82 261603 4276 2148 0 6 19 2 Will Grace Davies make you love her? | Live Sh...
2 235 15800 88 0 0 0 1 2 Marie-Louise Arsenault réplique à Denise Bomba...
3 371 46905 228 29 0 1 22 1 WATCH LIVE: Attorney General Sessions testifie...
4 383 13742 81 22 0 1 3 1 The National for Monday November 13, 2017 - Ki...

Distribution Check

In [36]:
from pandas.plotting import scatter_matrix
# Pairwise scatter of candidate predictors and the target, then a histogram
# of days-to-trend to inspect its distribution.
scatter_matrix(new_data[['publish_wd', 'publish_hr', 'day_to_trend']])
plt.show()
plt.hist(new_data['day_to_trend'])
plt.title("Histogram of Original Days to Trend")
plt.show()

The plots show that none of the variables follow a Gaussian distribution; they appear closer to a gamma distribution. In addition, the days-to-trend values cluster in one region, so we narrow the range down to two weeks (14 days).

In [37]:
# Restrict to videos that trended within two weeks of publishing.
new_data_14 = new_data.loc[new_data.day_to_trend <= 14]
plt.hist(new_data_14['day_to_trend'])
# The original title said "> 7", but the filter above removes values > 14.
plt.title("Histogram of Days to Trend After Removing Values > 14")
plt.show()
In [38]:
import sklearn
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
# Binarise the target: True if the video trended within a week of publishing.
# NOTE(review): in-place overwrite — running this cell twice turns every value
# True (False <= 7 evaluates to True), so the cell is not idempotent.
new_data.day_to_trend = new_data.day_to_trend <= 7

RF model defining

In [39]:
def rf_model(X, y, my_pg = None):
    """Grid-search a RandomForestClassifier with 5-fold cross-validation.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Training features.
    y : array-like
        Training labels.
    my_pg : dict, optional
        Parameter grid for GridSearchCV; defaults to max_depth 5-9 and
        n_estimators 155-169.

    Returns
    -------
    tuple
        (best_params_, best_score_) of the fitted grid search.
    """
    #perform Grid-search
    if my_pg is None:
        # tuning the hyperparameters to optimize: max depth of a tree and number of trees.
        my_pg={
            'max_depth': range(5,10), # tree depths 5 through 9 (range upper bound is exclusive).
            'n_estimators': range(155,170), # forest sizes 155 through 169.
            }
        
    gsc = GridSearchCV(
        estimator=RandomForestClassifier(),
        param_grid = my_pg,cv=5, scoring='accuracy', verbose=0, n_jobs=-1)
    
    grid_result = gsc.fit(X,y)
    
    return grid_result.best_params_,grid_result.best_score_

Split the dataset to train set (70%) and test set (30%)

In [85]:
# Features and binarised target; 70/30 split with a fixed seed for reproducibility.
X = new_data[['views', 'likes', 'dislikes', 'comment_count', 'publish_wd', 'publish_hr']]
y = new_data['day_to_trend']
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=4,test_size=.3)
In [86]:
print(rf_model(X_train,y_train)) #({'max_depth':9, 'n_estimators': 168}, 0.9398826708852417)
({'max_depth': 9, 'n_estimators': 161}, 0.9409109485202031)

RF classification modeling

In [87]:
from sklearn.metrics import classification_report as cr
# NOTE(review): n_estimators=168 comes from an earlier grid-search run (see
# the comment on the previous cell); the run shown above found 161 — confirm
# which was intended. warm_start appears to have no effect here since the
# model is fit only once.
rfc = RandomForestClassifier(max_depth = 9, n_estimators = 168, oob_score = True, warm_start = True)
rfc.fit(X_train, y_train)
Out[87]:
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=9, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=168,
                       n_jobs=None, oob_score=True, random_state=None,
                       verbose=0, warm_start=True)

Out of Bag score

In [88]:
print(rfc.oob_score_) # 0.9383350462487153
print(rfc.score(X_test,y_test)) # 0.9436450839328537
0.9383350462487153
0.9436450839328537

importances of 'views', 'likes', 'dislikes', 'comment_count', 'publish_wd', and 'publish_hr'

In [89]:
print(rfc.feature_importances_) 
[0.24909411 0.27384333 0.28854069 0.         0.06570286 0.12281902]
In [90]:
# Confusion matrix and classification report on the TRAINING set.
print(pd.crosstab(pd.Series(y_train, name='Actual'), pd.Series(rfc.predict(X_train),name='predicted_train')))
pred = rfc.predict(X_train)
print(cr(y_train, pred)) # ~96% accuracy on train (see report output)
predicted_train  False  True 
Actual                       
False               26    249
True               124    972
              precision    recall  f1-score   support

       False       1.00      0.76      0.86       298
        True       0.96      1.00      0.98      1648

    accuracy                           0.96      1946
   macro avg       0.98      0.88      0.92      1946
weighted avg       0.96      0.96      0.96      1946

In [91]:
# Confusion matrix and classification report on the held-out TEST set.
print(pd.crosstab(pd.Series(y_test,name='Actual'), pd.Series(rfc.predict(X_test), name='predicted_test')))
pred = rfc.predict(X_test)
print(cr(y_test, pred)) # 94% accuracy
predicted_test  False  True 
Actual                      
False               5     13
True               25    214
              precision    recall  f1-score   support

       False       0.97      0.67      0.79       134
        True       0.94      1.00      0.97       700

    accuracy                           0.94       834
   macro avg       0.95      0.83      0.88       834
weighted avg       0.94      0.94      0.94       834

In [92]:
# pip install scikit-plot
In [93]:
import scikitplot as skplt
from sklearn.metrics import average_precision_score, plot_precision_recall_curve
# Predicted class probabilities on the test set; column 1 is P(True),
# i.e. P(trended within 7 days).
prob = rfc.predict_proba(X_test)
myplot = skplt.metrics.plot_roc(y_test, prob)
average_precision = average_precision_score(y_test, prob[:,1])
# NOTE(review): plot_precision_recall_curve is deprecated in newer sklearn;
# PrecisionRecallDisplay.from_estimator is the replacement.
disp = plot_precision_recall_curve(rfc, X_test, y_test)
disp.ax_.set_title('2-class Precision-Recall curve:'
                  'AP={0:0.2f}'.format(average_precision))
# `pred` here is the test-set prediction computed in the previous cell.
score = metrics.f1_score(np.array(y_test),pred)
print('The f1 score for this model is {}'.format(score))
The f1 score for this model is 0.9673837612768911
In [ ]: